This is my data visualization portfolio.

I gathered code I wrote to create some nice plots, put them in an Rmarkdown to create an HTML page where I would have an easy and centralized access to all of those code lines. The goal is to preview the graphs so I can identify the features I want to re-use while creating a new plot, and to be able to copy and paste the chunks I want.

The .Rmd file can run by itself, all the data used to draw the plots are either simulated, arbitrarily defined, or from a dataset.

1 Packages

Below is the list of all the required packages. Mostly graphical tools, along with some statistical analysis ressources.

# Tidy data
library(tidyverse)
library(lubridate)

# Supplementary analysis
library(survival)
library(survminer)
library(TraMineR)
library(rstatix)
library(epiR)

# Data
library(ISLR)

# plotly
library(plotly)

# ggplot2 and addins
library(ggplot2)
library(ggfortify)
library(ggpubr)
library(gridExtra)
library(ggseqplot)
library(ggh4x)
library(ggstatsplot)

# Other graphics tools
library(treemap)

2 Basic plots

Starting with the basics : histograms, barplots, pie charts…

Basic is good, but make it pretty.

2.1 Barplots

2.1.1 Frequencies

An horizontal and stacked barplot :

# Simulate data for a barplot
dataBarplot <- data.frame(
  Episode = rep(c("First Covid-19 episode", "Last Covid-19 episode"), each = 63), # Two Covid episodes
  Time = factor(rep(rep(c("Acute phase", "3 months or more", "Still persisting"), each = 21), 2),
                levels = c("Acute phase", "3 months or more", "Still persisting")), # Symptoms last for a given time
  Symptoms = rep(c("Abdominal pain", "Anxiety", "Chest pain", "Cognitive disfunction", "Cough", # List of possible symptoms
                          "Depression", "Dizziness", "Fatigue", "Fever", "Gastrointestinal problem",
                          "Headache", "Joint pain", "Loss of smell or taste", "Menstruation issue", "Neuralgia",
                          "New allergies", "Shortness of breath", "Sleeping disorder", "Tachycardia", "Tinnitus",
                          "Troubled vision"), 6),
    N = c(0, 0, 0, 0, 2, 5, 9, 285, 404, 714, 646, 870, 765, 1873, 2632, 2106, 3181, 3844, # Number of cases
          3985, 4921, 3529, 0, 0, 0, 0, 0, 0, 3, 38,  51,  49,  82, 116, 119, 287, 142, 448, 
          190, 336, 158,  81, 502, 0, 1, 4, 6,  12,  12, 6, 315, 243, 192, 232, 212, 930, 394, 
          394, 902, 781, 351, 460,  92,  1318, 0, 0, 0, 0, 0, 0, 0,  71,  62, 119, 148, 162, 
          217, 123, 427, 347, 520, 669, 825, 730, 574, 0, 0, 0, 0, 1, 0, 0, 3, 7,  13, 7,  10,  
          22, 8,  13,  40,  15,  34,  18,  17,  36, 0, 1, 1, 2, 1, 2, 2, 61, 67, 59, 51, 58, 85, 
          217, 84, 200, 172, 79, 18, 115, 296)) %>%
  # Total number of cases per symptoms and episode
  mutate(Ntot = c(rep(by(N[Episode == "First Covid-19 episode"], Symptoms[Episode == "First Covid-19 episode"], sum), 3),
                  rep(by(N[Episode == "Last Covid-19 episode"], Symptoms[Episode == "Last Covid-19 episode"], sum), 3)))



# Initialize the plot with the dataset (DATA_ex2_1)
ggplot(dataBarplot) + 
  # Add a stacked bar chart (geom_col creates bars with heights determined by 'N')
  geom_col(
    mapping = aes(N, reorder(Symptoms, -N), fill = Time), # Map 'N' to x, 'Symptome' to y, and 'Temps' to fill color
    width = 0.6,                              # Set the bar width
    position = "stack"                        # Stack bars by the 'fill' variable
  ) +
  # Create facets (separate panels) for each level of the 'Episode' variable
  facet_grid(cols = vars(Episode))  +
  # Add a vertical line at x = 0 for reference
  geom_vline(aes(xintercept = 0)) + 
  # Customize the x-axis scale
  scale_x_continuous(
    limits = c(0, 6500),                     # Set the range for the x-axis
    breaks = seq(0, 6000, 1000),             # Major tick marks every 1000 units
    minor_breaks = seq(500, 6500, 500),      # Minor tick marks every 500 units
    expand = c(0, 0)                         # Remove padding on the x-axis
  ) + 
  scale_fill_manual(
    name = "Symptoms duration",
    values = c('#28AFB0',  '#F6803C',  '#82354F')
  ) +
  # Add text labels to display 'Ntot' values next to each bar
  geom_text(
    mapping = aes(Ntot + 25, y = Symptoms, label = Ntot), # Position and content of the text
    hjust = 0,                                            # Align text to the left of its position
    nudge_x = 1                                           # Slightly nudge text horizontally
  ) +
  # Add titles and subtitles (empty here for now)
  labs(title = "", subtitle = "") + 
  # Customize the overall appearance of the plot
  theme(
    axis.title = element_blank(),                      # Remove axis titles
    panel.background = element_rect(fill = "white"),   # Set panel background to white
    panel.grid.major.x = element_line(color = "darkgray"), # Major grid lines for x-axis
    panel.grid.minor.x = element_line(colour = "lightgray"), # Minor grid lines for x-axis
    legend.position = "bottom"
  )

This one is vertical and dodged :

# Simulate data for a barplot
dataBarplot <- data.frame(
  Time = factor(rep(c("Acute phase", "3 months or more", "Still persisting"), each = 14),
                levels = c("Acute phase", "3 months or more", "Still persisting")), # Symptoms last for a given time
  Symptoms = rep(c("Fatigue", "Loss of smell or taste", "Menstruation issue", "Neuralgia",  "Fever", "Gastrointestinal problem",
                          "Headache", "Sleeping disorder", "Tachycardia", "Joint pain",
                          "New allergies", "Shortness of breath", "Tinnitus",
                          "Troubled vision"), 3),
    N = c(285, 404, 714, 646, 870, 765, 1873, 2632, 2106, 3181, 3844, # Number of cases
          3985, 4921, 3529, 38,  51,  49,  82, 116, 119, 287, 142, 448, 
          190, 336, 158,  81, 502, 315, 243, 192, 232, 212, 930, 394, 
          394, 902, 781, 351, 460,  92,  1318))


# Initialize the plot with the dataset (DATA_ex2_1)
ggplot(dataBarplot) + 
  # Add a stacked bar chart (geom_col creates bars with heights determined by 'N')
  geom_col(
    mapping = aes(reorder(Symptoms, -N), N, fill = Time), # Map 'N' to x, 'Symptome' to y, and 'Temps' to fill color
    width = 0.9,                              # Set the bar width
    position = "dodge"                        # Stack bars by the 'fill' variable
  ) +
  # Add a vertical line at x = 0 for reference
  geom_hline(aes(yintercept = 0)) + 
  # Customize the x-axis scale
  scale_y_continuous(
    limits = c(0, 5500),                     # Set the range for the x-axis
    breaks = seq(0, 5000, 1000),             # Major tick marks every 1000 units
    minor_breaks = seq(500, 5500, 500),      # Minor tick marks every 500 units
    expand = c(0, 0)                         # Remove padding on the x-axis
  ) + 
  scale_fill_manual(
    name = "Symptoms duration",
    values = c('#28AFB0',  '#F6803C',  '#82354F')
  ) +
  # Add text labels to display 'Ntot' values next to each bar
  geom_text(
    mapping = aes(x = Symptoms, y = N, label = N), # Position and content of the text
    nudge_x = rep(c(-0.3, 0, 0.3), each = 14),
    nudge_y = 120, 
    size = 2
  ) +
  # Add titles and subtitles (empty here for now)
  labs(title = "", subtitle = "") + 
  # Customize the overall appearance of the plot
  theme(
    axis.title = element_blank(),                      # Remove axis titles
    panel.background = element_rect(fill = "white"),   # Set panel background to white
    panel.grid.major.y = element_line(color = "darkgray"), # Major grid lines for x-axis
    panel.grid.minor.y = element_line(colour = "lightgray"), # Minor grid lines for x-axis
    legend.position = "bottom",
    axis.text.x = element_text(
      size = 10,                               # Font size
      angle = 45,                              # Rotate text 45 degrees
      vjust = 1,                               # Vertical alignment
      hjust = 1                                # Horizontal alignment
    )
  )

2.1.2 Proportions

I like to display percentages as stacked barplots, to make it obvious that values sum up to a hundred.

This one has and grid according to the status, and I feel like it is easier to read the proportions differences whenever the bars are horizontal :

# Simulate data for a bar plot
dataBarplot <- data.frame(
  # Create a "Status" column with repeated categories (12 repetitions for each category)
  Status = rep(c("Family member", "Active worker", "Retired worker"), each = 12),
  # Create an "Affection" column, categorizing Covid-19 status, with ordered factor levels
  Affection = factor(
    rep(rep(c("No Covid-19", "Covid-19", "Long Covid-19"), each = 4), 3),
    levels = c("No Covid-19", "Covid-19", "Long Covid-19")
  ),
  # Create a "Social_satisfaction" column, representing satisfaction levels, with ordered factor levels
  Social_satisfaction = factor(
    rep(c("Very satisfied", "Satisfied", "Unsatisfied", "Very unsatisfied"), 9),
    levels = c("Very satisfied", "Satisfied", "Unsatisfied", "Very unsatisfied")
  ),
  # Define a numeric vector "N" representing counts corresponding to the combinations above
  N = c(65, 255, 70, 16, 38, 131, 41, 7, 21, 123, 59, 10,
        819, 2532, 591, 130, 640, 1723, 357, 38, 235, 1134, 468, 68,
        1444, 4212, 677, 128, 528, 1198, 164, 20, 160, 575, 166, 27)
) %>%
  # Add new columns using dplyr's mutate function
  mutate(
    # Compute the total count (Ntot) for each "Affection" group within each "Status" group
    Ntot = c(
      # For "Family member", calculate totals per "Affection" group
      rep(by(N[Status == "Family member"], Affection[Status == "Family member"], sum), each = 4),
      # For "Active worker", calculate totals per "Affection" group
      rep(by(N[Status == "Active worker"], Affection[Status == "Active worker"], sum), each = 4),
      # For "Retired worker", calculate totals per "Affection" group
      rep(by(N[Status == "Retired worker"], Affection[Status == "Retired worker"], sum), each = 4)
    ),
    # Calculate percentages (P) for each count (N) relative to the total count (Ntot) per group
    P = N / Ntot * 100
  )





# Initialize the plot with the dataset (DATA_ex2_2)
ggplot(dataBarplot) + 
  # Add a stacked bar chart
  geom_col(
    mapping = aes(P, Affection, fill = Social_satisfaction), # Map 'P' to x, 'COVID_long_3mod' to y, and 'Q_E1' to fill
    width = 0.6,                                   # Set the bar width
    position = "stack",                            # Stack bars by the 'fill' variable
    color = "white"                                # Add white border to bars
  ) +
  # Create facets (separate rows) for each level of the Status variable
  facet_grid(rows = vars(Status)) +
  # Add vertical lines at x = 0 and x = 100 for reference
  geom_vline(aes(xintercept = 0)) + 
  geom_vline(aes(xintercept = 100)) + 
  # Customize the x-axis scale
  scale_x_continuous(
    limits = c(0, 100.2),                        # Set the range for the x-axis
    breaks = seq(0, 100, 10),                    # Major tick marks every 10%
    minor_breaks = seq(5, 95, 10),              # Minor tick marks every 5%
    expand = c(0, 0),                            # Remove padding on the x-axis
    labels = paste0(seq(0, 100, 10), "%")       # Append "%" to the tick labels
  ) + 
  # Add titles and subtitles (empty here for now)
  labs(title = " ", subtitle = "") + 
  # Customize the overall appearance of the plot
  theme(
    axis.title = element_blank(),                      # Remove axis titles
    # axis.text.x = element_blank(),                  # Uncomment to hide x-axis labels
    # axis.text.y = element_blank(),                  # Uncomment to hide y-axis labels
    panel.background = element_rect(fill = "white"),   # Set panel background to white
    panel.grid.major.x = element_line(color = "darkgray"), # Major grid lines for x-axis
    panel.grid.minor.x = element_line(colour = "lightgray") # Minor grid lines for x-axis
  ) +
  # Customize the fill legend for the bar chart
  scale_fill_manual(
    "Social interactions",                               # Title for the legend
    values = rev(RColorBrewer::brewer.pal(4, "RdYlGn")), # Reverse color palette
    labels = c(                                        # Define labels for legend items
      "Very satisfied",                           # Very satisfactory
      "Satisfied",                         # Somewhat satisfactory
      "Unsatisfied",                       # Somewhat unsatisfactory
      "Very unsatisfied"                     # Very unsatisfactory
    )
  )

Adding some percentages labels and handling long variables names :

# Create the data frame for the bar plot
dataBarplot <- data.frame(
  # "LTI" column with long-term illness categories repeated for each age group
  LTI = factor(rep(
    c("Leukaemia", "Diabetes (insipidus & mellitus)", "Epilepsy", "Multiple Sclerosis", 
      "Parkinson & affiliated", "Mental Illness"), 5),
    levels = c("Leukaemia", "Diabetes (insipidus & mellitus)", "Epilepsy", "Multiple Sclerosis", 
               "Parkinson & affiliated", "Mental Illness")
  ),
  
  # "Age" column with age group categories repeated for each illness type
  Age = factor(rep(
    c("Under 18", "18 to 24 years old", "25 to 44 years old", "45 to 59 years old", "Over 60"), each = 6),
    levels = c("Under 18", "18 to 24 years old", "25 to 44 years old", "45 to 59 years old", "Over 60")
  ),
  
  # Randomly generated values for the "N" column representing counts
  N = sample(100:300, 30)
) %>% 
  # Calculate the percentage contribution of each count (N) within each illness type (LTI)
  mutate(
    percent = as.numeric(unlist(by(N, LTI, function(x) {x / sum(x) * 100})))[rep(seq(1, 26, 5), 5) + rep(0:4, each = 6)],
    # Label for percentages, shown only if the value exceeds 2%
    lab = ifelse(percent > 2, paste0(formatC(percent, 1, format = "f"), "%"), "")
  )

# Initialize a vector for positioning text labels within each bar
x_text <- rep(NA, nrow(dataBarplot))

# Calculate the cumulative y-position for placing the text labels
for (i in unique(dataBarplot$LTI)) {
  for (j in 1:sum(dataBarplot$LTI == i)) {
    ifelse(j == 1,
           x_text[which(dataBarplot$LTI == i)[j]] <- dataBarplot$percent[dataBarplot$LTI == i][j] / 2,
           x_text[which(dataBarplot$LTI == i)[j]] <- sum(dataBarplot$percent[dataBarplot$LTI == i][1:(j-1)]) +
                          dataBarplot$percent[dataBarplot$LTI == i][j] / 2)
  }
}
dataBarplot$x_text <- 100 - x_text  # Adjust text position to align with the plot

# Generate the bar plot
ggplot(data = dataBarplot) +
  # Create stacked bars for proportions by age group
  geom_col(
    mapping = aes(x = LTI, y = percent, fill = Age),
    width = 0.6,           # Bar width
    position = "stack",    # Stacked bar position
    color = "white"        # Bar border color
  ) +
  
  # Add percentage labels to each segment of the bar
  geom_text(
    aes(x = LTI, y = x_text, label = lab),
    size = 2,              # Font size
    colour = "white"       # Text color
  ) +
  
  # Add titles and axis labels
  labs(
    title = "Long term illnesses per age group", # Title
    ylab = "Proportion",                        # Y-axis label
    xlab = ""                                   # X-axis label (empty)
  ) +
  
  # Customize the y-axis scale
  scale_y_continuous(
    breaks = seq(0, 100, 25),                   # Tick intervals
    labels = c("0%", "25%", "50%", "75%", "100%") # Custom tick labels
  ) +
  
  # Customize the fill colors for the "Age" categories
  scale_fill_manual(
    values = c('#1E5471', '#28AFB0', '#F6803C', '#82354F', '#E0D6FF'),
    name = "Classe ATC1"                       # Legend title
  ) +
  
  # Apply a minimal theme to the plot
  theme_minimal() +
  
  # Customize the legend and x-axis text
  theme(
    legend.position = "right",                 # Position the legend on the right
    axis.text.x = element_text(
      size = 10,                               # Font size
      angle = 45,                              # Rotate text 45 degrees
      vjust = 1,                               # Vertical alignment
      hjust = 1                                # Horizontal alignment
    )
  )

2.1.3 Incidence rates

A very basic one. But adding a second axis with error-bars representing the incidence rate CI95% :

# Define maximum y-axis value and coefficient for secondary y-axis scaling
ymax <- 200
coeff <- 70 / 200

# Create data frame for bar plot
dataBarplot <- data.frame(
  year = 2012:2022,                             # Years from 2012 to 2022
  n_cases = c(120, 87, 78, 96, 117, 153, 139, 164, 181, 188, 163), # Number of cases per year
  pop = rep(300000, 11)                         # Constant population size of 300,000
) %>%
  mutate(
    # Calculate incidence rate (IR) per 100,000 population
    IR = n_cases / pop * 100000,
    # Calculate confidence intervals for the incidence rate
    IR_lower = epi.conf(cbind(n_cases, pop), ctype = "inc.rate", method = "exact")[, 2] * 100000,
    IR_upper = epi.conf(cbind(n_cases, pop), ctype = "inc.rate", method = "exact")[, 3] * 100000
  )

# Create the plot
ggplot(dataBarplot, aes(x = year, y = n_cases, width = 0.95)) +
  # Add bars for number of cases
  geom_col(
    mapping = aes(fill = "Incidence"),
    fill = "#AAC0AF" # Bar color
  ) +
  # Add a line for incidence rate (IR)
  geom_line(
    aes(
      x = year,
      y = IR,
      colour = "Incidence rate" # Legend label for line
    ),
    linetype = 1,               # Solid line
    size = 1,                   # Line thickness
    colour = "black"            # Line color
  ) +
  # Add error bars for confidence intervals of the incidence rate
  geom_errorbar(
    aes(
      x = year,
      ymin = IR_lower,           # Lower bound of the confidence interval
      ymax = IR_upper,           # Upper bound of the confidence interval
      width = 0.2                # Error bar width
    )
  ) +
  # Add text labels for the number of cases above each bar
  geom_text(
    aes(x = year, y = n_cases, label = n_cases),
    nudge_y = 7,                # Offset text above the bars
    size = 6                    # Font size
  ) +
  # Customize the y-axis
  scale_y_continuous(
    expand = c(0, 0),           # Remove padding around the axis
    limits = c(0, ymax),        # Set y-axis limits
    breaks = seq(0, ymax, 25),  # Major tick marks
    name = "Incidence",         # Left y-axis label
    sec.axis = sec_axis(
      trans = ~ . * coeff,      # Transformation for secondary axis
      name = "Incidence rate",  # Right y-axis label
      breaks = seq(0, ymax * coeff, 10) # Tick marks for secondary axis
    )
  ) +
  # Customize the x-axis
  scale_x_continuous(
    breaks = 2012:2022,         # Tick marks for each year
    labels = 2012:2022,         # Labels for each year
    name = ""                   # Empty label for the x-axis
  ) +
  # Apply a classic theme
  theme_classic() +
  # Customize grid lines, axis text, and axis colors
  theme(
    panel.grid.major.y = element_line(colour = "grey"),    # Major grid lines
    panel.grid.minor.y = element_line(colour = "lightgray"), # Minor grid lines
    axis.text.x = element_text(size = 11),                # Font size for x-axis labels
    axis.line.y.left = element_line(color = "#AAC0AF"),   # Color for left y-axis line
    axis.title.y.left = element_text(color = "#AAC0AF"),  # Color for left y-axis title
    axis.text.y.left = element_text(color = "#AAC0AF")    # Color for left y-axis labels
  ) +
  # Add titles and remove legend title
  labs(
    title = "",                     # Plot title
    fill = "",                      # Legend title (empty)
    x = " ",                        # x-axis title (empty)
    y = " "                         # y-axis title (empty)
  )

2.2 Line charts

Let’s start with a pretty basic double axis line chart :

# Create a dataset for line chart visualization
dataLinechart <- data.frame(
  year = seq.Date(as.Date("2015-01-01"), as.Date("2023-01-01"), by = "year"),  # Sequence of years from 2015 to 2023
  n_tlc = c(545268, 668497, 551234, 619147, 582365, 854256, 968214, 1023578, 945127),  # Number of teleconsultations per year
  prop_disp = c(0.23, 0.25, 0.24, 0.28, 0.25, 0.35, 0.38, 0.41, 0.37)  # Proportion of teleconsultations with dispensing
) %>%
  mutate(
    n_tlc_disp = n_tlc * prop_disp  # Calculate teleconsultations with dispensing
  )

# Define the maximum value for y-axis scaling
ymax <- 1250000

# Create the plot using ggplot2
ggplot(dataLinechart, aes(x = year)) +
  # Add line and points for teleconsultations with dispensing
  geom_line(aes(y = n_tlc_disp, color = "Teleconsultations with dispensing", lty = "Teleconsultations with dispensing")) +
  geom_point(aes(x = year, y = n_tlc_disp, color = "Teleconsultations with dispensing"), size = 1.0) +
  
  # Add line and points for total teleconsultations
  geom_line(aes(y = n_tlc, color = "Teleconsultations", lty = "Teleconsultations")) +
  geom_point(aes(x = year, y = n_tlc, color = "Teleconsultations"), size = 1.0) +
  
  # Add line and points for the ratio (scaled proportion)
  geom_line(aes(y = (prop_disp * ymax), color = "Ratio", lty = "Ratio")) +
  geom_point(aes(x = year, y = (prop_disp * ymax), color = "Ratio"), size = 1.0) +
  
  # Set labels for the legend
  labs(color = "", lty = "") +
  
  # Customize the colors for each line
  scale_colour_manual(values = c(
    "Teleconsultations with dispensing" = "#be123c",
    "Teleconsultations" = "#1e3a8a",
    "Ratio" = "#64748b"
  )) +
  
  # Customize the line types for each line
  scale_linetype_manual(values = c(
    "Teleconsultations with dispensing" = 1,
    "Teleconsultations" = 1,
    "Ratio" = 2
  )) +
  
  # Adjust the legend to remove unnecessary symbols
  guides(linetype = guide_legend(override.aes = list(shape = NA))) +
  
  # Set up the y-axis with dual scales (primary and secondary)
  scale_y_continuous(
    breaks = seq(0, ymax, 250000),  # Primary axis breaks
    labels = formatC(seq(0, ymax, 250000), digits = 0, format = "f", big.mark = " "),  # Format for primary axis
    sec.axis = sec_axis(
      ~ . / ymax,  # Scale the secondary axis relative to ymax
      name = "Ratio of the number of teleconsultations \nwith dispensing to the total number of dispensations",
      labels = c("0%", "25%", "50%", "75%", "100%")  # Labels for the secondary axis
    ),
    expand = c(0, 0)
  ) +
  
  # Set the range for the y-axis
  coord_cartesian(ylim = c(0, ymax)) +
  
  # Customize the x-axis with year labels
  scale_x_date(
    breaks = seq.Date(as.Date("2015-01-01"), as.Date("2023-01-01"), by = "year"),
    labels = format.Date(seq.Date(as.Date("2015-01-01"), as.Date("2023-01-01"), by = "year"), "%Y")
  ) +
  
  # Add labels for axes
  labs(
    x = "",
    y = "Number of teleconsultations"
  ) +
  
  # Apply a minimal theme and customize the plot appearance
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line.x = element_line(colour = "black", size = 0.3),
    axis.ticks.x = element_line(colour = "black", size = 0.3),
    axis.line.y = element_line(colour = "black", size = 0.3),
    axis.ticks.y = element_line(colour = "black", size = 0.3),
    legend.position = c(0.25, 0.85),  # Position the legend
    legend.background = element_rect(fill = "transparent", color = "white")  # Add transparent background for legend
  )

2.3 Boxplot

I don’t really enjoy boxplots as I feel they get quite boring. But you can add some twists to make it interesting !

2.3.1 Classic

Basic setup :

# Create a data frame for boxplot visualization
dataBoxplot <- data.frame(
  med_id = rep(1:8, c(8, 3, 3, 5, 7, 6, 4, 11)),  # Assign medical operator IDs, repeated based on surgery counts
  op_id = c(1:8, 1:3, 1:3, 1:5, 1:7, 1:6, 1:4, 1:11),  # Assign operation IDs for each surgery
  setup = pmax(10 - rpois(47, lambda = 4), 1)  # Simulate 'setup' scores, capped at 10 and minimum of 1
)

# Create a data frame for the line plot (average setup values per operation ID)
dataLine <- data.frame(
  op_id = 1:max(dataBoxplot$op_id),  # Sequence of operation IDs
  setup = as.numeric(by(dataBoxplot$setup, dataBoxplot$op_id, mean))  # Compute mean 'setup' scores by op_id
)

# Generate the plot using ggplot2
ggplot(dataBoxplot) +
  # Add a boxplot layer for setup scores grouped by operation ID
  geom_boxplot(
    aes(group = op_id, x = op_id, y = setup)  # Map operation ID and setup scores to boxplot
  ) +
  # Add a red line connecting the mean setup values for each operation ID
  geom_line(
    data = dataLine,          # Use the data frame with mean setup values
    aes(op_id, setup),        # Map operation ID to x and mean setup to y
    color = "red"             # Set line color to red
  ) +
  # Add red points for the mean setup values
  geom_point(
    data = dataLine,          # Use the data frame with mean setup values
    aes(op_id, setup),        # Map operation ID to x and mean setup to y
    color = "red",            # Set point color to red
    shape = 16                # Use solid circle for points
  ) +
  annotate(
    "text",
    x = 1:max(dataBoxplot$op_id), y = 10,
    label = paste0("(n=", table(dataBoxplot$op_id),")")
  ) +
  # Add a subtitle with the p-value from a linear model (setup ~ op_id)
  labs(subtitle = paste0("p=", round(summary(lm(setup ~ op_id, dataBoxplot))$coefficient[2, 4], 4))) +
  # Label the y-axis
  ylab("Surgery setup grade (out of 10)") +
  # Label the x-axis
  xlab("Number of surgeries") +
  # Customize the y-axis scale
  scale_y_continuous(
    limits = c(0, 10.5),         # Set y-axis limits between 0 and 10
    breaks = 0:10,             # Add breaks at each integer value
    expand = c(0, 0)           # Remove padding on the axis
  ) +
  # Customize the x-axis scale
  scale_x_discrete(
    limits = 1:max(dataBoxplot$op_id),  # Limit x-axis to the range of operation IDs
    breaks = 1:max(dataBoxplot$op_id)  # Add breaks at each operation ID
  ) +
  # Apply a minimal theme for clean visualization
  theme_minimal()

Jitter boxplots fixes one of the classic boxplots downside by adding the points to give a better idea of the data distribution :

dataBoxplot <- data.frame(
  y = rnorm(200), 
  Group = sample(LETTERS[1:3], size = 200, replace = TRUE)
)

# Box plot by group with jitter
ggplot(dataBoxplot, aes(x = Group, y = y, colour = Group)) + 
  geom_boxplot(outlier.shape = NA) + # Box plot without showing outliers
  geom_jitter(width = 0.2) +         # Add jittered points for individual observations
  theme_minimal() +                  # Use a minimal theme for clean visualization
  xlab("Group") +                    # X-axis label
  ylab("Some randomly generated values") # Y-axis label

Adding some two-by-two testing :

# Simulate data for a boxplot
dataBoxplot <- data.frame(
  id = rep(1:47, 5),  # 47 subjects, repeated 5 times for each group
  times = rep(paste0("Month ", c(0, 3, 6, 9, 12)), each = 47),  # 5 time points
  val = c(  # Simulate values for each time point with different means and SDs
    rnorm(n = 47, mean = 2, sd = 1),  # Group 1 (mean=2, sd=1)
    rnorm(n = 47, mean = 5, sd = 1),  # Group 2 (mean=5, sd=1)
    rnorm(n = 47, mean = 5, sd = 2),  # Group 3 (mean=5, sd=2)
    rnorm(n = 47, mean = 8, sd = 2),  # Group 4 (mean=8, sd=2)
    rnorm(n = 47, mean = 8, sd = 5)   # Group 5 (mean=8, sd=5)
  )
)
dataBoxplot$val <- pmax(0, dataBoxplot$val)

# Perform ANOVA to test differences between the groups
res.aov <- anova_test(
  data = dataBoxplot,
  dv = val,     # Dependent variable: val (size)
  wid = id,     # Within-subjects factor: id
  between = times  # Between-subjects factor: times
)

# Pairwise t-tests to compare each group with group 1, adjusting for multiple comparisons using Bonferroni correction
pwc <- dataBoxplot %>%
  pairwise_t_test(
    val ~ times,  # Dependent variable 'val' across levels of 'times'
    paired = TRUE,  # Paired samples
    ref.group = "Month 0",  # Use "n°1" as the reference group
    p.adjust.method = "bonferroni"  # Apply Bonferroni correction to p-values
  ) %>%
  add_xy_position(x = "times")  # Add xy-position for displaying p-values

# Create the boxplot with individual data points and p-values
ggboxplot(
  dataBoxplot,  # Data for the boxplot
  x = "times",  # Group by 'times'
  y = "val",    # Plot 'val' (size) on the y-axis
  add = "point",  # Add individual data points to the boxplot
  ylab = "Size",  # Label for the y-axis
  xlab = ""      # No label for the x-axis
) +
  scale_y_continuous(
    limits = c(0, 25),
    expand = c(0, 0)
  ) +
  stat_pvalue_manual(pwc) +  # Add p-values from pairwise t-tests
  labs(
    subtitle = get_test_label(res.aov, detailed = FALSE),  # Subtitle for ANOVA result
    caption = "2 by 2 testing: Student's t-test, p-value adjustment: Bonferroni"  # Caption explaining test details
  )

2.3.2 Violins

Switching to violins plots to add some info and some nice colors :

ggstatsplot::ggbetweenstats(ISLR::Wage, education, wage, "np") +
  ylab("Workers raw wage") +
  xlab("Education level")

3 Models outputs

This section is dedicated to regression models outputs and other associated graph.

3.1 Survival curve

Starting with the survival analysis, the most classic graph is the descending survival curve :

# Simulate survival data
n <- 1000  # Number of individuals
dataSurvCurve <- data.frame(
  id = 1:n,  # Unique identifier for each individual
  time = as.numeric(sample(1:365, n, replace = TRUE)),  # Simulated follow-up time (1 to 365 days)
  event = factor(c(
    sample(0:1, n/2, prob = c(0.4, 0.6), replace = TRUE),  # Events for the unexposed group
    sample(0:1, n/2, prob = c(0.7, 0.3), replace = TRUE)   # Events for the exposed group
  )),  # Event status as a factor: 0 = censored, 1 = event
  exposure = rep(c("Unexposed", "Exposed"), each = n/2)  # Exposure groups
)

# Adjust time for censored observations (event == 0)
dataSurvCurve$time[dataSurvCurve$event == 0] <- 365

# Fit the survival model with event stratified by exposure
fit <- survfit(Surv(time = time, event = event) ~ exposure, data = dataSurvCurve)

# Define the upper limit of the y-axis in percentage
ymax <- 30

# Plot survival curves using autoplot from the ggplot2 framework
autoplot(fit, surv.linetype = 'blank', surv.colour = NA) +  # Start with a blank base plot
  geom_step(aes(linetype = strata, color = event), size = 0.8) +  # Add step lines by strata and event
  scale_linetype_manual(
    name = "Group",                     # Legend title for group
    values = c('dotted', 'solid'),      # Line types for the groups
    labels = c('Non-exposed', 'Exposed') # Labels for the groups
  ) +
  scale_color_manual(
    name = "Event",                     # Legend title for events
    values = c("white", "#1C4073"),     # Colors for censored and events
    labels = c(' ', "Hospitalised")     # Labels for the events
  ) +
  labs(
    x = "Follow up period (days)",      # X-axis label (French: Follow-up period in days)
    y = "Proportion of population",     # Y-axis label
    title = ""                          # Empty title
  ) +
  geom_vline(aes(xintercept = 0), size = 1) +  # Add vertical line at x = 0
  geom_hline(aes(yintercept = 1 - ymax / 100), size = 1) +  # Add horizontal line for the ymax threshold
  scale_y_reverse(
    limits = 1 - c(ymax / 100, 1),      # Reverse y-axis to represent decreasing survival
    breaks = 1 - seq(ymax / 100, 1, 0.1), # Break points for y-axis
    labels = paste0(seq(ymax, 100, 10), "%"), # Labels as percentages
    expand = c(0, 0)                    # No expansion around the limits
  ) +
  scale_x_continuous(
    breaks = seq(0, 360, 60),           # X-axis breaks every 60 days
    expand = c(0, 0)                    # No expansion around the limits
  ) +
  theme_minimal() +                      # Use a minimal theme
  theme(
    plot.title = element_text(hjust = 0.5),                        # Center align the title
    axis.title.x = element_text(face = "bold", colour = "black", size = 12), # Style x-axis title
    axis.title.y = element_text(face = "bold", colour = "black", size = 12), # Style y-axis title
    legend.title = element_text(face = "bold", size = 10),         # Style legend title
    panel.grid.major.y = element_line(colour = "lightgray", size = 0.3), # Grid for y-axis
    panel.grid.minor.x = element_blank(),                          # Remove minor grid for x-axis
    legend.position = c(0.15, 0.30),                               # Position legend inside plot
    legend.background = element_rect(fill = "transparent", color = "white") # Transparent background for legend
  )

Second version of almost the same curve, adding some confidence interval using ggsurvplot() :

# Simulate survival data for analysis
n <- 1000  # Number of individuals
dataSurvCurve <- data.frame(
  id = 1:n,  # Unique identifier for each individual
  time = as.numeric(sample(1:365, n, replace = TRUE)),  # Simulated follow-up time between 1 and 365 days
  event = c(
    sample(0:1, n/2, prob = c(0.4, 0.6), replace = TRUE),  # First group: 40% censored, 60% events
    sample(0:1, n/2, prob = c(0.7, 0.3), replace = TRUE)   # Second group: 70% censored, 30% events
  ), 
  # Event status: 0 = censored, 1 = event occurred
  exposure = rep(c("Unexposed", "Exposed"), each = n/2)  # Exposure status: "Unexposed" or "Exposed"
)

# Fit survival curves stratified by exposure
fit <- survfit(Surv(time = time, event = event) ~ exposure, data = dataSurvCurve)

# Plot the survival curves using ggsurvplot
ggsurvplot(
  fit,                          # Survival fit object
  data = dataSurvCurve,         # Data source
  legend.title = "Group",       # Title for the legend
  legend.labs = c("Exposed", "Unexposed"),  # Labels for legend groups
  conf.int = TRUE,              # Display confidence intervals
  censor = FALSE,               # Do not display censoring marks
  xlab = "Follow up time (days)",  # Label for x-axis
  ylab = "Non-hospitalisation probability", # Label for y-axis
  break.time.by = 60,           # Breaks on the x-axis every 60 days
  surv.scale = "percent",       # Scale survival probability as percentages
  ylim = c(0, 1),               # Set y-axis limits (0% to 100%)
  axes.offset = FALSE,          # Align axes at the origin
  legend = c(0.12, 0.15)        # Position the legend within the plot
)

Let’s add one more event and have an some components (I would use ggsurvplot(), but it does not handle a survfit object having a non binary event) :

# Simulating survival data
n <- 1000  # Number of individuals
dataSurvCurve <- data.frame(
  id = 1:n,
  time = as.numeric(sample(1:365, n, replace = TRUE)),  # Simulated follow-up time between 1 and 365 days
  event = factor(c(sample(0:2, n/2, prob = c(0.4, 0.4, 0.2), replace = TRUE),
                   sample(0:2, n/2, prob = c(0.2, 0.3, 0.5), replace = TRUE))),  # Event (0 = censored, 1 = sick, 2 = dead)
  exposure = rep(c("Unexposed", "Exposed"), each = n/2)  # Exposure status (Unexposed or Exposed)
)

dataSurvCurve$time[dataSurvCurve$event == 0] <- 365  # Set time to 365 for censored observations (event == 0)

fit <- survfit(Surv(time = time, event = event) ~ exposure, data = dataSurvCurve) # Fitting a Survival Model


# Plot a survival curve with `autoplot` and enhance it with additional layers
autoplot(fit, surv.linetype = 'blank', surv.colour = NA) + 
  # Add step lines representing survival curves
  geom_step(
    aes(linetype = strata, color = event), # Map line type to `strata` and color to `event`
    size = 0.8                             # Set line thickness
  ) +
  # Customize the line types for survival curves
  scale_linetype_manual(
    name = "Group",                         # Title for the legend
    values = c('dashed', 'solid'),          # Define line types: dashed for unexposed, solid for exposed
    labels = c('Unexposed', 'Exposed')      # Labels for the legend
  ) +
  # Customize the colors for the event states
  scale_color_manual(
    name = "State",                         # Title for the legend
    values = c("#AAC0AF", "#1C4073", "#B28B84"), # Colors for Healthy, Sick, and Dead states
    labels = c('Healthy', 'Sick', 'Dead')   # Labels for the legend
  ) +
  # Add axis labels and a title
  labs(
    x = "Followup period (days)",           # X-axis label
    y = "Proportion of the population",     # Y-axis label
    title = ""                              # Title (empty for now)
  ) +
  # Add a vertical reference line at x = 0
  geom_vline(
    aes(xintercept = 0), # Position of the line
    size = 1             # Thickness of the line
  ) +
  # Customize the y-axis scale
  scale_y_continuous(
    limits = c(0, 1),                      # Set the y-axis range from 0 to 1
    breaks = seq(0, 1, 0.1),               # Major ticks every 0.1
    labels = paste0(seq(0, 100, 10), "%"), # Convert tick labels to percentages
    expand = c(0, 0)                       # Remove padding on the y-axis
  ) +
  # Customize the x-axis scale
  scale_x_continuous(
    breaks = seq(0, 360, 60),              # Major ticks every 60 days
    expand = c(0, 0)                       # Remove padding on the x-axis
  ) +
  # Apply a minimal theme for a clean appearance
  theme_minimal() +
  # Customize specific theme elements
  theme(
    plot.title = element_text(hjust = 0.5),          # Center-align the plot title
    axis.title.x = element_text(
      face = "bold",                                 # Make x-axis title bold
      colour = "black",                              # Set title color to black
      size = 12                                      # Set font size
    ),
    axis.title.y = element_text(
      face = "bold",                                 # Make y-axis title bold
      colour = "black",                              # Set title color to black
      size = 12                                      # Set font size
    ),
    legend.title = element_text(face = "bold", size = 10), # Style legend titles
    panel.grid.major.y = element_line(
      colour = "lightgray",                          # Light gray major grid lines on the y-axis
      size = 0.3                                     # Set line thickness
    ),
    panel.grid.minor.x = element_blank()             # Remove minor grid lines on the x-axis
  )

Lastly, combining different curves using ggsurvplot() :

# Create a data frame for survival analysis
dataSurvCurve <- data.frame(
  os.time = colon$time,          # Overall survival time from 'colon' dataset
  os.status = colon$status,      # Overall survival event status (1 = event, 0 = censored)
  pfs.time = sample(colon$time), # Progression-free survival time, randomized for illustration
  pfs.status = colon$status,     # Progression-free survival event status (1 = event, 0 = censored)
  sex = colon$sex,               # Patient sex
  rx = colon$rx,                 # Treatment type
  adhere = colon$adhere          # Adherence to treatment
)

# Fit survival curves for progression-free survival (PFS) by treatment type
fit.pfs <- survfit(Surv(pfs.time, pfs.status) ~ rx, data = dataSurvCurve)

# Fit survival curves for overall survival (OS) by treatment type
fit.os <- survfit(Surv(os.time, os.status) ~ rx, data = dataSurvCurve)

# Combine the survival fits into a list for plotting
fits <- list(PFS = fit.pfs, OS = fit.os)

# Plot the survival curves using ggsurvplot
ggsurvplot(
  fits,                          # List of survival fits (PFS and OS)
  data = dataSurvCurve,          # Data used for plotting
  combine = TRUE,                # Combine PFS and OS plots into one
  censor = FALSE,                # Do not display censoring marks
  palette = "jco",               # Use JCO color palette
  surv.scale = "percent",        # Display survival probabilities as percentages
  ylim = c(0, 1),                # Set y-axis limits (0% to 100% survival)
  axes.offset = FALSE,           # Align axes at the origin
  legend = "right",              # Position the legend on the right
  xlab = "Survival time (years)",# Label for x-axis
  xscale = "d_y",                # Scale x-axis from days to years
  break.x.by = 365.25,           # Breaks on the x-axis every year
  legend.title = "Treatment type" # Title for the legend
)

Or display the different cruves in a grid :

fit <- survfit( Surv(time, status) ~ sex, data = colon)

ggsurvplot_facet(fit, colon, facet.by = "rx",
                palette = "jco",                # Combine PFS and OS plots into one
  censor = FALSE,                 # Use JCO color palette
  surv.scale = "percent",        # Display survival probabilities as percentages
  ylim = c(0.3, 1),                # Set y-axis limits (0% to 100% survival)
  axes.offset = FALSE,           # Align axes at the origin
  legend = c("PFS", "OS"),              # Position the legend on the right
  xlab = "Survival time (years)",# Label for x-axis
  xscale = "d_y",                # Scale x-axis from days to years
  break.x.by = 365.25,           # Breaks on the x-axis every year
  legend.title = "Treatment type", # Title for the legend)
  conf.int = TRUE
)

3.2 Forest plots

Forest plots are the go to graph for any type of regression that present results as exponential of it’s coefficients. I really like the existing packages displaying the OR and CI95% so I tried to reproduce it.

# Create the data frame for the forest plot
dataForest <- data.frame(
  var = c("Age", "Age", "Age", "Age", "Sex", "Sex", "Diabetes", "Diabetes", "Smoker", "Smoker"), # Variables
  mod = c("18-24", "25-44", "45-64", "64+", "Male", "Female", "No", "Yes", "No", "Yes"), # Categories or modalities within variables
  estimate = c(NA, 1.05, 1.38, 1.72, NA, 0.87, NA, 2.48, NA, 0.97), # Point estimates (Odds Ratios)
  conf.low = c(NA, 0.92, 1.21, 1.44, NA, 0.72, NA, 2.13, NA, 0.88), # Lower bounds of confidence intervals
  conf.high = c(NA, 1.22, 1.63, 1.87, NA, 1.01, NA, 2.67, NA, 1.05) # Upper bounds of confidence intervals
) %>% 
  mutate(
    id = 1:10, # Assign a unique ID to each row
    label = ifelse(is.na(estimate), "Ref", 
                   paste0(formatC(estimate, 2, format = "f"), " [",
                          formatC(conf.low, 2, format = "f"), "-", 
                          formatC(conf.high, 2, format = "f"), "]")), 
    # Create labels for display: "estimate [conf.low-conf.high]" or "Ref" if missing
    estimate = ifelse(is.na(estimate), 1, estimate), # Replace missing estimates with 1 (reference)
    conf.low = ifelse(is.na(conf.low), 1, conf.low), # Replace missing lower bounds with 1 (reference)
    conf.high = ifelse(is.na(conf.high), 1, conf.high), # Replace missing upper bounds with 1 (reference)
    signif = ifelse(conf.low <= 1 & conf.high >= 1, "p>0.05", "p<0.05") 
    # Determine significance: overlaps 1 (p>0.05) or not (p<0.05)
  )

# Create the forest plot using ggplot2
ggplot(dataForest) + 
  geom_vline(
    xintercept = 1, # Reference line at Odds Ratio = 1
    color = "gray25", 
    linetype = 2 # Dashed line
  ) + 
  geom_vline(
    xintercept = c(0.25, 0.5, 2, 4), # Additional vertical lines for log-scale guidance
    color = "gray75",
    linetype = "dotted"
  ) +
  geom_errorbar(
    aes(
      y = reorder(mod, -id), # Reorder y-axis labels based on ID
      xmin = conf.low, # Lower bound of the confidence interval
      xmax = conf.high # Upper bound of the confidence interval
    ),
    position = position_dodge(0.5), # Adjust error bar position
    width = 0.4, # Width of the error bars
    size = 0.8 # Thickness of the error bars
  ) + 
  geom_point(
    aes(x = estimate, y = reorder(mod, -id), color = signif), # Add points for the estimates
    position = position_dodge(width = 0.5), # Adjust point position
    size = 2.5 # Size of the points
  ) + 
  scale_x_log10( # Use a logarithmic scale for x-axis
    breaks = c(0.5, 1, 2, 4), # Specify tick marks
    limits = c(0.49, 6) # Set x-axis limits
  ) +
  scale_color_manual(
    name = "Significance", # Legend title for significance
    values = c("red", "black") # Colors for significant and non-significant points
  ) +
  facet_grid(
    rows = vars(var), # Create separate panels for each variable
    scales = "free", # Allow scales to adjust freely per panel
    switch = "both" # Switch facet labels to both sides
  ) +
  xlab("Odds ratio") + # Label for the x-axis
  ylab("") + # Remove y-axis label
  theme_classic() + # Use a clean, minimal theme
  theme(panel.grid.major.y = element_line(colour = "lightgray")) + # Add gridlines for readability
  geom_text(
    aes(x = 5, y = reorder(mod, -id), label = dataForest$label), # Add text labels for estimates
  )

Here is a simpler version, with no OR table and a lighter display :

# Create the data frame for the forest plot
dataForest <- data.frame(
  var = c("Age", "Age", "Age", "Age", "Sex", "Sex", "Diabetes", "Diabetes", "Smoker", "Smoker"), # Variables
  mod = c("18-24", "25-44", "45-64", "64+", "Male", "Female", "No", "Yes", "No", "Yes"), # Categories or modalities within variables
  estimate = c(NA, 1.05, 1.38, 1.72, NA, 0.87, NA, 2.48, NA, 0.97), # Point estimates (Odds Ratios)
  conf.low = c(NA, 0.92, 1.21, 1.44, NA, 0.72, NA, 2.13, NA, 0.88), # Lower bounds of confidence intervals
  conf.high = c(NA, 1.22, 1.63, 1.87, NA, 1.01, NA, 2.67, NA, 1.05) # Upper bounds of confidence intervals
) %>% 
  dplyr::filter(!is.na(estimate)) %>%
  mutate( 
    id = 1:6, # Assign a unique ID to each row
    mod = paste0(var, " = ", mod),# Create labels for display
    estimate = ifelse(is.na(estimate), 1, estimate), # Replace missing estimates with 1 (reference)
    conf.low = ifelse(is.na(conf.low), 1, conf.low), # Replace missing lower bounds with 1 (reference)
    conf.high = ifelse(is.na(conf.high), 1, conf.high), # Replace missing upper bounds with 1 (reference)
    signif = ifelse(conf.low <= 1 & conf.high >= 1, "p>0.05", "p<0.05") 
    # Determine significance: overlaps 1 (p>0.05) or not (p<0.05)
  )



# Create the forest plot using ggplot2
ggplot(dataForest) + 
  geom_vline(
    xintercept = 1, # Reference line at Odds Ratio = 1
    color = "gray25", 
    linetype = 2 # Dashed line
  ) + 
  geom_vline(
    xintercept = c(0.25, 0.5, 2, 4), # Additional vertical lines for log-scale guidance
    color = "gray75",
    linetype = "dashed"
  ) +
  geom_errorbar(
    aes(
      y = reorder(mod, -id), # Reorder y-axis labels based on ID
      xmin = conf.low, # Lower bound of the confidence interval
      xmax = conf.high # Upper bound of the confidence interval
    ),
    position = position_dodge(0.5), # Adjust error bar position
    width = 0.2, # Width of the error bars
    size = 0.8 # Thickness of the error bars
  ) + 
  geom_point(
    aes(x = estimate, y = reorder(mod, -id), shape = signif, size = signif), # Add points for the estimates
    position = position_dodge(width = 0.5), # Adjust point position
  ) + 
  scale_x_log10( # Use a logarithmic scale for x-axis
    breaks = c(0.5, 1, 2, 4), # Specify tick marks
    limits = c(0.49, 6) # Set x-axis limits
  ) +
  scale_shape_manual(
    name = "Significance", # Legend title for significance
    values = c(15, 18) # Colors for significant and non-significant points
  ) +
  scale_size_manual(
    name = "Significance", # Legend title for significance
    values = c(4, 2) # Colors for significant and non-significant points
  ) +
  xlab("Odds ratio") + # Label for the x-axis
  ylab("") + # Remove y-axis label
  theme_classic() + # Use a clean, minimal theme
  theme(panel.grid.major.y = element_line(colour = "lightgray", linetype = "dotted")) # Add gridlines for readability